1. Wykorzystaj rmarkdown do zbudowania dokumentu html, z opisem poniższych analiz.

  2. Wczytaj te zbiory danych. Dla pierwszego zbuduj model zależności pomiędzy wzrostem żony i męża. Drugi zawiera, dla różnych genomów, między innymi informacje o wielkości genomu oraz o średnim współczynniku GC (udział zasad G lub C w genomie).

library(PBImisc)
library(ggplot2)
library(stringr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the example datasets shipped with PBImisc:
# `heights` — paired husband/wife heights; `genomes` — per-genome size,
# GC content, habitat and taxonomy.
data(heights)
data(genomes)
help(heights)
  1. Przedstaw graficznie zależność pomiędzy cechami. Sprawdź czy transformacja (np. logarytm) nie poprawi liniowości modelu.

  2. Wyznacz model liniowy dla obu powyższych zależności używając funkcji lm().

  3. Użyj funkcji plot() aby wyznaczyć wykresy diagnostyczne.

  4. Użyj funkcji rstandard, rstudent, cooks.distance aby wyznaczyć reszty i miary wpływu.

Heights dataset

heights[1:4, ]
##   Husband Wife
## 1     186  175
## 2     180  168
## 3     160  154
## 4     186  166
ggplot(data=heights) + geom_histogram(aes(x=Husband))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=heights) + geom_histogram(aes(x=Wife))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Fit the linear model Wife ~ Husband and collect per-observation
# diagnostics (fitted values, studentized residuals, leverages,
# Cook's distances) next to the raw data.
hgt_lm <- lm(Wife ~ Husband, data = heights)
hgts_model <- cbind(
  heights,
  data.frame(
    "var" = heights$Husband,         # predictor
    "obs" = heights$Wife,            # response
    "fit" = fitted(hgt_lm),          # fitted(): avoid partial matching on $fit
    "rstudent" = rstudent(hgt_lm),   # externally studentized residuals
    "hat" = hatvalues(hgt_lm),       # leverages
    "cooks" = cooks.distance(hgt_lm) # influence measure
  )
)
hgts_model$"ID" <- seq_len(nrow(hgts_model))  # observation labels for plots
# Mean squared prediction error. NOTE(review): divides by n, not n - 2,
# so this is the (biased) MSE, not the unbiased residual-variance estimate.
error_var <- sum((hgts_model$obs - hgts_model$fit)^2) / nrow(hgts_model)

Fitting results:

summary(hgt_lm)
## 
## Call:
## lm(formula = Wife ~ Husband, data = heights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4685  -3.9208   0.8301   3.9538  11.1287 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.93015   10.66162   3.933 0.000161 ***
## Husband      0.69965    0.06106  11.458  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.928 on 94 degrees of freedom
## Multiple R-squared:  0.5828, Adjusted R-squared:  0.5783 
## F-statistic: 131.3 on 1 and 94 DF,  p-value: < 2.2e-16
ggplot(data=hgts_model) + geom_point(aes(x=var, y=obs)) + geom_abline(intercept=hgt_lm$coef[1], slope=hgt_lm$coef[2], col="red")

The slope and intercept are statistically significant.

Diagnostics

Residues distribution:
qplot(hgts_model$fit - hgts_model$obs, geom="histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(hgts_model$rstudent, geom="histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Hand-made normal QQ-plot of the studentized residuals, plus the base-R
# qqnorm() for comparison. Use ppoints() for the plotting positions: the
# naive i/n grid makes the largest empirical-CDF value exactly 1, so
# qnorm() returns Inf for the last point and it vanishes from the plot.
residual.data <- data.frame("res" = sort(hgts_model$rstudent))
residual.data$"emp.cdf" <- ppoints(nrow(residual.data))
residual.data$"th.quant" <- qnorm(residual.data$"emp.cdf")
ggplot(data = residual.data, aes(x = th.quant, y = res)) +
  geom_point() +
  geom_abline(col = "red")

qqnorm(hgts_model$rstudent)

The residuals are fairly normally distributed, but skewed — the distribution of wives’ heights is more dispersed for lower heights.

Residues vs hatvalues:
ggplot(data=hgts_model) + geom_text(aes(x=hat, y=rstudent, label=ID)) + geom_abline(slope=0, col='red') + ggtitle("Residues vs leverages")

No significantly deviant observations with high leverage; Observations 39 and 72 a little suspicious.

Cook’s distance:
# Cook's distance against fitted values, studentized residuals and
# leverages, labelled by observation ID.
ggplot(hgts_model, aes(x = fit, y = cooks, label = ID)) + geom_text()

ggplot(hgts_model, aes(x = rstudent, y = cooks, label = ID)) + geom_text()

ggplot(hgts_model, aes(x = hat, y = cooks, label = ID)) + geom_text()

Another potential outlier is the observation 12.

Visualisation of potential outliers
# Flag the three suspicious observations and redraw the scatter with the
# fitted regression line.
hgts_model$"Outlier" <- factor(0, levels = c(0, 1))
hgts_model$"Outlier"[c(12, 39, 72)] <- 1
ggplot(hgts_model, aes(x = var, y = obs, col = Outlier, label = ID)) +
  geom_text() +
  geom_abline(intercept = coef(hgt_lm)[1], slope = coef(hgt_lm)[2], col = "red")

Observation 12 is a potential outlier, but since it’s surrounded by many observations it does not influence the model.

hgts_model[12,c("Husband", "Wife")]
##    Husband Wife
## 12     178  147

Genomes dataset

Scaling the data

# Reload the genomes data and rescale both variables to unit standard
# deviation. No centring is applied, so all values stay positive and the
# log()/sqrt() transformations below remain well-defined.
data(genomes)
names(genomes)
genomes$size <- genomes$size / sd(genomes$size)
genomes$GC <- genomes$GC / sd(genomes$GC)

Transformations:

# Visual check of linearity for candidate transformations of the
# predictor (size) and, in the last plot, of the response (GC).
qplot(genomes$size, genomes$GC)

qplot(log(genomes$size), genomes$GC)

qplot(sqrt(genomes$size), genomes$GC)

qplot(genomes$size, sqrt(genomes$GC))

Logarithmic and square root transformations of independent variable seem best. Taking square root of GC doesn’t change much so I’ll pass on it.

Models:

# Candidate models: raw genome size plus three monotone transformations
# of the predictor. All four are collected in a named list so the
# diagnostic loops below can iterate over them.
genome_lm <- lm(GC~size, data=genomes)
log_lm <- lm(GC~log(size), data=genomes)
sqrt_lm <- lm(GC~sqrt(size), data=genomes)
double_sqrt_lm <- lm(GC~sqrt(sqrt(size)), data=genomes)  # this transform was inspired by the diagnostic plots, which showed that the square root transform performed better than the log transform
models <- list('raw.size'=genome_lm, 'log.size'=log_lm, 'sqrt.size'=sqrt_lm, 'double_sqrt.size'=double_sqrt_lm)

Results:

# Per-model summary frames for plotting. Use data.frame() rather than
# as.data.frame(cbind(...)): cbind() builds a matrix, which silently
# coerces the factor columns (organism, group) to their integer codes.
genome_summaries <- lapply(models, function(m) {
  data.frame(
    "org" = genomes$organism,
    "group" = genomes$group,
    "size" = genomes$size,
    "GC" = genomes$GC,
    predict(m, interval = "confidence"),  # adds columns fit, lwr, upr
    "hat" = hatvalues(m),
    "res" = rstudent(m)
  )
})
# Residual sum of squares per model (the deviance of an lm is its RSS).
genome_RSS <- lapply(models, deviance)
for (model_name in names(models)) {
  p <- ggplot(data = genome_summaries[[model_name]]) +
    geom_point(aes(x = size, y = GC, col = group)) +
    geom_line(aes(x = size, y = fit), col = "red") +
    geom_ribbon(aes(x = size, ymin = lwr, ymax = upr), alpha = 0.1, fill = "orange") +
    ggtitle(model_name, subtitle = str_c("RSS = ", genome_RSS[[model_name]]))
  show(p)
}

The best fit (lowest RSS) is for the double square (i.e. fourth) root of the genome size. However, different groups seem to exhibit different dependencies, which suggests taking some test to check this. But first, let’s do some diagnostics:

# Standard lm diagnostic plots for every model, captioned by model name.
invisible(lapply(names(models), function(n) {
  plot(models[[n]], sub.caption = n)
}))

The dependence of residuals on fitted values is most linear for the square root and the double square root transform, which suggests that they are better than the log transform (even though the log transform has lower RSS than the square root transform). The variance seems to be heteroskedastic, being the largest for medium genome sizes (which may be because those genomes are probably the most common ones). There is one rather influential outlier for non-transformed data, which vanishes after taking any transformation.

More diagnostics

Breusch-Pagan (bptest): heteroskedasticity
# Breusch–Pagan test for heteroskedasticity, run on every model.
invisible(lapply(names(models), function(n) {
  print(n)
  show(bptest(models[[n]]))
}))
## [1] "raw.size"
## 
##  studentized Breusch-Pagan test
## 
## data:  models[[n]]
## BP = 3.4043, df = 1, p-value = 0.06503
## 
## [1] "log.size"
## 
##  studentized Breusch-Pagan test
## 
## data:  models[[n]]
## BP = 17.207, df = 1, p-value = 3.352e-05
## 
## [1] "sqrt.size"
## 
##  studentized Breusch-Pagan test
## 
## data:  models[[n]]
## BP = 8.0361, df = 1, p-value = 0.004585
## 
## [1] "double_sqrt.size"
## 
##  studentized Breusch-Pagan test
## 
## data:  models[[n]]
## BP = 12.451, df = 1, p-value = 0.0004179

The variance is indeed heteroskedastic (except maybe for non-transformed genome size, but it’s not a good model).

Goldfeld-Quandt (gqtest): heteroskedasticity
# Goldfeld–Quandt test: does residual variance decrease after the first
# 30% of observations?
invisible(lapply(names(models), function(n) {
  print(n)
  show(gqtest(models[[n]], point = 0.3, alternative = "less"))
}))
## [1] "raw.size"
## 
##  Goldfeld-Quandt test
## 
## data:  models[[n]]
## GQ = 0.63184, df1 = 505, df2 = 215, p-value = 1.984e-05
## alternative hypothesis: variance decreases from segment 1 to 2
## 
## [1] "log.size"
## 
##  Goldfeld-Quandt test
## 
## data:  models[[n]]
## GQ = 0.625, df1 = 505, df2 = 215, p-value = 1.284e-05
## alternative hypothesis: variance decreases from segment 1 to 2
## 
## [1] "sqrt.size"
## 
##  Goldfeld-Quandt test
## 
## data:  models[[n]]
## GQ = 0.62035, df1 = 505, df2 = 215, p-value = 9.472e-06
## alternative hypothesis: variance decreases from segment 1 to 2
## 
## [1] "double_sqrt.size"
## 
##  Goldfeld-Quandt test
## 
## data:  models[[n]]
## GQ = 0.6204, df1 = 505, df2 = 215, p-value = 9.501e-06
## alternative hypothesis: variance decreases from segment 1 to 2

After one-third of the observations the variance decreases (but not after one-fifth, i.e. for point=0.2).

Harrison-McCabe (hmctest): heteroskedasticity again
# Harrison–McCabe test for heteroskedasticity with the breakpoint at the
# middle of the data.
invisible(lapply(names(models), function(n) {
  print(n)
  show(hmctest(models[[n]], point = 0.5))
}))
## [1] "raw.size"
## 
##  Harrison-McCabe test
## 
## data:  models[[n]]
## HMC = 0.59011, p-value = 0.998
## 
## [1] "log.size"
## 
##  Harrison-McCabe test
## 
## data:  models[[n]]
## HMC = 0.59773, p-value = 1
## 
## [1] "sqrt.size"
## 
##  Harrison-McCabe test
## 
## data:  models[[n]]
## HMC = 0.59844, p-value = 0.999
## 
## [1] "double_sqrt.size"
## 
##  Harrison-McCabe test
## 
## data:  models[[n]]
## HMC = 0.59936, p-value = 0.999

This test checks which fraction of RSS is generated by a given fraction of data, so it basically checks for a trend in residual variance. This won’t work here because the variance of residues is more-or-less symmetric.

Durbin-Watson (dwtest): autocorrelation of residues
# Durbin–Watson test for positive lag-1 autocorrelation of residuals.
invisible(lapply(names(models), function(n) {
  print(n)
  show(dwtest(models[[n]], alternative = "greater"))
}))
## [1] "raw.size"
## 
##  Durbin-Watson test
## 
## data:  models[[n]]
## DW = 0.90326, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
## 
## [1] "log.size"
## 
##  Durbin-Watson test
## 
## data:  models[[n]]
## DW = 0.84417, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
## 
## [1] "sqrt.size"
## 
##  Durbin-Watson test
## 
## data:  models[[n]]
## DW = 0.87276, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
## 
## [1] "double_sqrt.size"
## 
##  Durbin-Watson test
## 
## data:  models[[n]]
## DW = 0.85763, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

The autocorrelation is greater than 0. Check:

cor(double_sqrt_lm$residuals[-1], double_sqrt_lm$residuals[-length(sqrt_lm$residuals)])
## [1] 0.5703214

The residuals are positively autocorrelated along the row order. That’s strange because the data doesn’t seem to be ordered. Check whether this autocorrelation is not an artifact:

# Sanity check: after randomly permuting the residuals, the lag-1
# autocorrelation should vanish. Derive n from the model rather than
# hard-coding 724, and spell out FALSE (F is reassignable).
n_obs <- length(double_sqrt_lm$residuals)
shuffle <- sample(seq_len(n_obs), n_obs, replace = FALSE)
cor(double_sqrt_lm$residuals[shuffle][-1],
    double_sqrt_lm$residuals[shuffle][-n_obs])
## [1] -0.01167001

There is no autocorrelation in shuffled data, as it should be, so the apparent autocorrelation is not a bug. The data might be ordered somehow, but at first glance I don’t see any ordering.

qplot(1:724, double_sqrt_lm$residuals)

Breusch-Godfrey (bgtest): higher order autocorrelation
# Breusch–Godfrey test for serial correlation up to order 5.
invisible(lapply(names(models), function(n) {
  print(n)
  show(bgtest(models[[n]], order = 5))
}))
## [1] "raw.size"
## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  models[[n]]
## LM test = 225.3, df = 5, p-value < 2.2e-16
## 
## [1] "log.size"
## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  models[[n]]
## LM test = 251.97, df = 5, p-value < 2.2e-16
## 
## [1] "sqrt.size"
## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  models[[n]]
## LM test = 238.47, df = 5, p-value < 2.2e-16
## 
## [1] "double_sqrt.size"
## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  models[[n]]
## LM test = 245.48, df = 5, p-value < 2.2e-16
Harvey-Collier (harvtest): linearity
# Harvey–Collier test for linearity, with observations ordered by size.
invisible(lapply(names(models), function(n) {
  print(n)
  show(harvtest(models[[n]], order.by = ~size, data = genomes))
}))
## [1] "raw.size"
## 
##  Harvey-Collier test
## 
## data:  models[[n]]
## HC = 4.2988, df = 721, p-value = 1.953e-05
## 
## [1] "log.size"
## 
##  Harvey-Collier test
## 
## data:  models[[n]]
## HC = 1.1855, df = 721, p-value = 0.2362
## 
## [1] "sqrt.size"
## 
##  Harvey-Collier test
## 
## data:  models[[n]]
## HC = 1.7856, df = 721, p-value = 0.07459
## 
## [1] "double_sqrt.size"
## 
##  Harvey-Collier test
## 
## data:  models[[n]]
## HC = 0.35363, df = 721, p-value = 0.7237

The relationship turned out to be linear for every transformation but not for un-transformed data, as should be expected.

Rainbow (raintest): linearity
# Rainbow test for linearity: refit on a central fraction of the
# size-ordered data and compare with the full fit.
fract <- 0.3  # fraction of observations kept
centr <- 0.8  # centre of the kept window
invisible(lapply(names(models), function(n) {
  print(n)
  show(raintest(models[[n]], fraction = fract, center = centr,
                order.by = ~size, data = genomes))
}))
## [1] "raw.size"
## 
##  Rainbow test
## 
## data:  models[[n]]
## Rain = 0.94989, df1 = 507, df2 = 215, p-value = 0.6784
## 
## [1] "log.size"
## 
##  Rainbow test
## 
## data:  models[[n]]
## Rain = 0.90224, df1 = 507, df2 = 215, p-value = 0.8198
## 
## [1] "sqrt.size"
## 
##  Rainbow test
## 
## data:  models[[n]]
## Rain = 0.90592, df1 = 507, df2 = 215, p-value = 0.8103
## 
## [1] "double_sqrt.size"
## 
##  Rainbow test
## 
## data:  models[[n]]
## Rain = 0.8982, df1 = 507, df2 = 215, p-value = 0.83

I don’t think the rainbow test will do well in this case, because the data is basically “locally linear”, and highly spread for medium genome sizes. This means that after restriction to a fraction of data either each model will fit better (after restriction to small or large genome sizes), or there will be no substantial change (after restriction to medium genome sizes)

# Reconstruct (approximately) the window of observations the rainbow test
# keeps, and compare it visually with the full data set.
# nrow() replaces the hard-coded 724.
test.data <- data.frame("size" = genomes$size, "GC" = genomes$GC)
n_g <- nrow(test.data)
test.data <- test.data[as.integer(centr * n_g - fract * n_g * 0.5):(centr * n_g + fract * n_g * 0.5), ]
test.data <- test.data[order(test.data$size), ]
ggplot(data = genomes) + geom_point(aes(x = size, y = GC))

ggplot(data = test.data) + geom_point(aes(x = size, y = GC))

RESET (resettest): test for correct functional form (linearity)
# Ramsey RESET test for functional-form misspecification.
invisible(lapply(names(models), function(n) {
  print(n)
  show(resettest(models[[n]]))
}))
## [1] "raw.size"
## 
##  RESET test
## 
## data:  models[[n]]
## RESET = 10.703, df1 = 2, df2 = 720, p-value = 2.628e-05
## 
## [1] "log.size"
## 
##  RESET test
## 
## data:  models[[n]]
## RESET = 1.9135, df1 = 2, df2 = 720, p-value = 0.1483
## 
## [1] "sqrt.size"
## 
##  RESET test
## 
## data:  models[[n]]
## RESET = 2.2425, df1 = 2, df2 = 720, p-value = 0.1069
## 
## [1] "double_sqrt.size"
## 
##  RESET test
## 
## data:  models[[n]]
## RESET = 0.64246, df1 = 2, df2 = 720, p-value = 0.5263

The first model is ill-specified, as should be expected. The others are ok. The results are in agreement with those of the Harvey-Collier test.

Shapiro test: normality of residues
# Shapiro–Wilk normality test on the standardized residuals.
invisible(lapply(names(models), function(n) {
  print(n)
  show(shapiro.test(rstandard(models[[n]])))
}))
## [1] "raw.size"
## 
##  Shapiro-Wilk normality test
## 
## data:  rstandard(models[[n]])
## W = 0.98816, p-value = 1.372e-05
## 
## [1] "log.size"
## 
##  Shapiro-Wilk normality test
## 
## data:  rstandard(models[[n]])
## W = 0.99189, p-value = 0.0005388
## 
## [1] "sqrt.size"
## 
##  Shapiro-Wilk normality test
## 
## data:  rstandard(models[[n]])
## W = 0.99193, p-value = 0.0005609
## 
## [1] "double_sqrt.size"
## 
##  Shapiro-Wilk normality test
## 
## data:  rstandard(models[[n]])
## W = 0.99261, p-value = 0.001176
# Kolmogorov–Smirnov test of the standardized residuals against N(0, 1).
invisible(lapply(names(models), function(n) {
  print(n)
  show(ks.test(rstandard(models[[n]]), pnorm))
}))
## [1] "raw.size"
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstandard(models[[n]])
## D = 0.05521, p-value = 0.02422
## alternative hypothesis: two-sided
## 
## [1] "log.size"
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstandard(models[[n]])
## D = 0.05711, p-value = 0.01778
## alternative hypothesis: two-sided
## 
## [1] "sqrt.size"
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstandard(models[[n]])
## D = 0.052789, p-value = 0.03537
## alternative hypothesis: two-sided
## 
## [1] "double_sqrt.size"
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstandard(models[[n]])
## D = 0.053754, p-value = 0.03048
## alternative hypothesis: two-sided

Both tests rejected the hypothesis that the residues are normally distributed. This may be caused by heteroskedasticity.

# Histograms of the standardized residuals of every model.
invisible(lapply(names(models), function(n) {
  print(n)
  show(qplot(rstandard(models[[n]])))
}))
## [1] "raw.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "log.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "sqrt.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "double_sqrt.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Homogeneity of the genomes

# Model with a separate intercept per taxonomic group on top of the
# fourth-root size term.
h_lm <- lm(GC ~ sqrt(sqrt(size)) + group, data=genomes)
summary(h_lm)
## 
## Call:
## lm(formula = GC ~ sqrt(sqrt(size)) + group, data = genomes)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.51270 -0.38227  0.00924  0.36254  2.15650 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.346268   0.635538   0.545  0.58604    
## sqrt(sqrt(size))                 2.882327   0.163052  17.677  < 2e-16 ***
## groupActinobacteria              1.088543   0.592758   1.836  0.06672 .  
## groupAlphaproteobacteria         0.482474   0.591827   0.815  0.41522    
## groupAquificae                  -0.340001   0.681919  -0.499  0.61822    
## groupBacteroidetes/Chlorobi     -0.097822   0.600945  -0.163  0.87074    
## groupBetaproteobacteria          0.762479   0.592348   1.287  0.19844    
## groupChlamydiae/Verrucomicrobia  0.136177   0.616029   0.221  0.82511    
## groupChloroflexi                 0.329485   0.634526   0.519  0.60374    
## groupCrenarchaeota               0.373607   0.609124   0.613  0.53984    
## groupCyanobacteria               0.020887   0.599069   0.035  0.97220    
## groupDeinococcus-Thermus         1.706968   0.658326   2.593  0.00972 ** 
## groupDeltaproteobacteria         0.569864   0.601659   0.947  0.34389    
## groupEpsilonproteobacteria      -0.424194   0.604703  -0.701  0.48323    
## groupEuryarchaeota               0.087597   0.598706   0.146  0.88372    
## groupFirmicutes                 -0.653009   0.591869  -1.103  0.27028    
## groupFusobacteria               -1.256735   0.831680  -1.511  0.13122    
## groupGammaproteobacteria        -0.168401   0.589364  -0.286  0.77517    
## groupNanoarchaeota               0.005401   0.838256   0.006  0.99486    
## groupOther Bacteria             -0.139052   0.619192  -0.225  0.82238    
## groupPlanctomycetes             -0.136672   0.828215  -0.165  0.86898    
## groupSpirochaetes               -0.582370   0.608795  -0.957  0.33910    
## groupThermotogae                -0.226928   0.631363  -0.359  0.71938    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5855 on 701 degrees of freedom
## Multiple R-squared:  0.6677, Adjusted R-squared:  0.6572 
## F-statistic: 64.01 on 22 and 701 DF,  p-value: < 2.2e-16
lrtest(h_lm, double_sqrt_lm)
## Likelihood ratio test
## 
## Model 1: GC ~ sqrt(sqrt(size)) + group
## Model 2: GC ~ sqrt(sqrt(size))
##   #Df  LogLik  Df  Chisq Pr(>Chisq)    
## 1  24 -628.03                          
## 2   3 -843.29 -21 430.52  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The difference in groups is significant.

# Fit a separate GC ~ log(size) model within every taxonomic group and
# collect the fitted values back in the original row order.
groups <- unique(genomes$group)
gr_models <- vector("list", length(groups))  # preallocate, don't grow in the loop
for (g in seq_along(groups)) {
  d <- genomes[genomes$group == groups[[g]], ]
  gr_models[[g]] <- lm(GC ~ log(size), data = d, x = TRUE)
}
# fitted() instead of partial matching on $fit; unlist() keeps the row
# names carried by each fitted vector.
gr_fit <- unlist(lapply(gr_models, fitted))
# Restore the original data order via the (numeric) row names.
gr_fit <- gr_fit[order(as.numeric(names(gr_fit)))]
gr_models_summary <- genomes[, c("size", "GC", "group")]
gr_models_summary$fit <- gr_fit

ggplot(data = gr_models_summary) +
  geom_point(aes(col = group, x = size, y = GC)) +
  geom_line(aes(x = size, y = fit, col = group))

The relationships are very different for different groups of organisms (sometimes even inverse), suggesting no universal biological connection between genome size and GC content.

# Pick one group (index 13, Bacteroidetes/Chlorobi) for closer inspection.
gr_nb = 13
groups[[gr_nb]]
## [1] Bacteroidetes/Chlorobi
## 22 Levels: Acidobacteria Actinobacteria Alphaproteobacteria ... Thermotogae
gr_models[[gr_nb]]
## 
## Call:
## lm(formula = GC ~ log(size), data = d, x = T)
## 
## Coefficients:
## (Intercept)    log(size)  
##      4.0883      -0.9068
# Inspect the chosen group: labelled scatter with the group-specific
# fitted line, followed by the standard lm diagnostic plots.
d <- genomes[genomes$group == groups[[gr_nb]], c("size", "GC")]
d$fit <- fitted(gr_models[[gr_nb]])
d$ID <- rownames(d)
ggplot(d, aes(x = size)) +
  geom_text(aes(y = GC, label = ID)) +
  geom_line(aes(y = fit))

plot(gr_models[[gr_nb]])